# ---------------------------------------------------------------------------
# Notebook setup: plotting/display configuration and raw badge-data loading.
# ---------------------------------------------------------------------------
import numpy as np
from IPython import get_ipython
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets

# Equivalent to the notebook magics `%matplotlib inline` and
# `%config InlineBackend.figure_format = 'retina'`.
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import networkx as nx
import pandas as pd

# Widen pandas display limits so wide frames render fully in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_colwidth', 5000)

# Plotly (offline) + cufflinks for quick DataFrame.iplot() charts.
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import plotly.express as px
import cufflinks as cf

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Load the badge data set.  Columns used below: 'Badge earned', 'Rank Name',
# 'Sub Domain', 'Initiate a badge date' -- TODO confirm against the workbook.
df = pd.read_excel('GDS_EY_Badges_Earned8528efa.xlsx')
df.head()
df.info()

# Parse the badge-earned timestamp and order the rows chronologically.
df['Badge earned'] = pd.to_datetime(df['Badge earned'])
df = df.sort_values(by='Badge earned')
df.groupby('Badge earned').count().iplot()
df.head()

# Build the universe of "process" node labels from ranks and sub-domains.
process_list = df['Rank Name'].to_list()
process_list.extend(df['Sub Domain'].to_list())
process_list = set(process_list)
proc_list = pd.DataFrame(process_list, columns=['process_list'])

# Integer-encode every node label so graphs can use numeric node ids.
from sklearn.preprocessing import LabelEncoder

labelencoder_X_1 = LabelEncoder()
proc_list.loc[:, 'process_list_encoded'] = labelencoder_X_1.fit_transform(
    proc_list.loc[:, 'process_list'])
proc_list = proc_list.sort_values(by='process_list_encoded').reset_index(drop=True)
proc_list
proc_list['Type'] = 'Type1'

# Node attribute table used when building the graphs below.
df_node = proc_list[['process_list', 'process_list_encoded', 'Type']]
df_node.shape

# label -> encoded id lookup, applied to both endpoints of every edge.
res = dict(zip(proc_list.process_list, proc_list.process_list_encoded))
df['Parent_process_encoded'] = df['Rank Name'].map(res)
df['New_process_encoded'] = df['Sub Domain'].map(res)
# ---------------------------------------------------------------------------
# Edge list, train/test graphs, negative-edge sampling and adjacency SVD.
# ---------------------------------------------------------------------------
df_edge = df[['Parent_process_encoded', 'New_process_encoded', 'Initiate a badge date']]
df_edge.head()
df_edge['Initiate a badge date'] = df_edge['Initiate a badge date'].astype('str')
edge1 = df_edge
node1 = df_node
edge1.head()

# Multi-digraph over all observed (parent -> new) process transitions.
G = nx.from_pandas_edgelist(
    edge1, 'Parent_process_encoded', 'New_process_encoded',
    ['Parent_process_encoded', 'New_process_encoded', 'Initiate a badge date'],
    create_using=nx.MultiDiGraph())
nx.set_node_attributes(
    G, node1.set_index('process_list_encoded')['process_list'].to_dict(), 'process_list')
nx.set_node_attributes(
    G, node1.set_index('process_list_encoded')['Type'].to_dict(), 'Type')
pos = nx.layout.spring_layout(G)
df.head()

# Simple digraph used for link-prediction features.
g = nx.from_pandas_edgelist(
    df, source='Parent_process_encoded', target='New_process_encoded',
    edge_attr=['Initiate a badge date'], create_using=nx.DiGraph())
print(nx.info(g))
pos = nx.spring_layout(g)
nx.draw(g, pos, node_color='#A0CBE2', edge_color='#00bb5e', width=1,
        edge_cmap=plt.cm.Blues, with_labels=True)

# Number of unique nodes.
len(g.nodes)

import random

# Fast membership map of the positive (existing) edges.
edges = dict()
for edge in g.edges:
    edges[(edge[0], edge[1])] = 1
print(nx.info(g))

# Sample 69 "missing" node pairs as negative examples: a pair qualifies when
# it is not an existing edge and is either disconnected or more than 2 hops
# apart.  NOTE(review): node ids are assumed to lie in 1..362 -- confirm.
missing_edges = set([])
while len(missing_edges) < 69:
    a = random.randint(1, 362)
    b = random.randint(1, 362)
    tmp = edges.get((a, b), -1)
    if tmp == -1 and a != b:
        try:
            if nx.shortest_path_length(g, source=a, target=b) > 2:
                missing_edges.add((a, b))
            else:
                continue
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Fix: the original bare `except:` swallowed every error; only
            # "no path / unknown node" should count the pair as missing.
            missing_edges.add((a, b))
    else:
        continue
len(missing_edges)
missing_edges

import pickle

pickle.dump(missing_edges, open('missing_edges_final.p', 'wb'))
missing_edges = pickle.load(open('missing_edges_final.p', 'rb'))
df.head()

# Positive links come from the real graph; negatives from the sampled pairs.
df_pos = df[['Parent_process_encoded', 'New_process_encoded']]
df_neg = pd.DataFrame(list(missing_edges),
                      columns=['Parent_process_encoded', 'New_process_encoded'])
df_pos.info()
df_neg.info()

# 80/20 split, done separately for positives and negatives so the training
# graph (and all features) can be built from positive train data only.
from sklearn.model_selection import train_test_split

X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
    df_pos, np.ones(len(df_pos)), test_size=0.2, random_state=9)
X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(
    df_neg, np.zeros(len(df_neg)), test_size=0.2, random_state=9)

df_train_pos = pd.DataFrame(X_train_pos)
# Headerless CSV so nx.read_edgelist can parse it directly.
df_train_pos.to_csv('train_data.csv', header=False, index=False)

# Graph built from training data only ...
g = nx.read_edgelist('train_data.csv', delimiter=',',
                     create_using=nx.DiGraph(), nodetype=int)
print(nx.info(g))

# ... and the graph over the whole data set, for coverage statistics.
G = nx.from_pandas_edgelist(df, source='Parent_process_encoded',
                            target='New_process_encoded', create_using=nx.DiGraph())
print(nx.info(G))

train_nodes_pos = set(g.nodes())
total_nodes_pos = set(G.nodes())
test_nodes_pos = set(X_test_pos.values.flatten())
print('no of process common in train and test -- ',
      len(train_nodes_pos.intersection(test_nodes_pos)))
print('no of process present in train but not present in test -- ',
      len(train_nodes_pos - test_nodes_pos))
print('no of process present in test but not present in train -- ',
      len(test_nodes_pos - train_nodes_pos))
print(' % of processes not there in Train but exist in Test in total Test data are {} %'.
      format(len(test_nodes_pos - train_nodes_pos) / len(test_nodes_pos) * 100))

# Final feature frames: positives followed by negatives, labels to match.
df_final_train = X_train_pos.append(X_train_neg, ignore_index=True)
y_final_train = np.concatenate((y_train_pos, y_train_neg))
df_final_test = X_test_pos.append(X_test_neg, ignore_index=True)
y_final_test = np.concatenate((y_test_pos, y_test_neg))
df_final_train.info()
df_final_test.info()

# Truncated SVD (k=3) of the train-graph adjacency matrix; rows of U and V
# are used later as per-node embedding features.
Adj = nx.adjacency_matrix(g, nodelist=sorted(g.nodes()))
Adj = Adj.asfptype()
Adj
from scipy.sparse.linalg import svds, eigs

U, s, V = svds(Adj, k=3)
print('U Shape', U.shape)
print('V Shape', V.shape)
print('s Shape', s.shape)
# ---------------------------------------------------------------------------
# Graph-topology similarity features computed on the training graph `g`.
# All helpers deliberately read the module-level graph `g`.
# ---------------------------------------------------------------------------
del Adj
del s

import math  # fix: `math` was never imported, so the cosine helpers always
             # hit NameError and silently returned 0 via their bare excepts.


def jaccard_for_followees(a, b):
    """Jaccard similarity of the successor (followee) sets of nodes a and b.

    Returns 0 when either node has no successors or is absent from `g`.
    Fix: the original condition used `== 0 | len(...)`, which parses as
    `len(x) == (0 | len(y))` because `|` binds tighter than `==`.
    """
    try:
        if len(set(g.successors(a))) == 0 or len(set(g.successors(b))) == 0:
            return 0
        sim = (len(set(g.successors(a)).intersection(set(g.successors(b))))) / \
              (len(set(g.successors(a)).union(set(g.successors(b)))))
    except nx.NetworkXError:  # node not present in g
        return 0
    return sim


def jaccard_for_followers(a, b):
    """Jaccard similarity of the predecessor (follower) sets of a and b."""
    try:
        # fix: same `|` precedence bug as jaccard_for_followees.
        if len(set(g.predecessors(a))) == 0 or len(set(g.predecessors(b))) == 0:
            return 0
        sim = (len(set(g.predecessors(a)).intersection(set(g.predecessors(b))))) / \
              (len(set(g.predecessors(a)).union(set(g.predecessors(b)))))
        return sim
    except nx.NetworkXError:
        return 0


def cosine_for_followees(a, b):
    """Cosine (Otsuka–Ochiai) similarity of the successor sets of a and b."""
    try:
        # fix: same `|` precedence bug.
        if len(set(g.successors(a))) == 0 or len(set(g.successors(b))) == 0:
            return 0
        sim = (len(set(g.successors(a)).intersection(set(g.successors(b))))) / \
              (math.sqrt(len(set(g.successors(a))) * len(set(g.successors(b)))))
        return sim
    except nx.NetworkXError:
        return 0


def cosine_for_followers(a, b):
    """Cosine similarity of the predecessor sets of a and b.

    Fix: the original took sqrt of only the first factor
    (`sqrt(len(pred(a))) * len(pred(b))`); the whole product belongs under
    the square root, consistent with cosine_for_followees.
    """
    try:
        if len(set(g.predecessors(a))) == 0 or len(set(g.predecessors(b))) == 0:
            return 0
        sim = (len(set(g.predecessors(a)).intersection(set(g.predecessors(b))))) / \
              (math.sqrt(len(set(g.predecessors(a))) * len(set(g.predecessors(b)))))
        return sim
    except nx.NetworkXError:
        return 0


# PageRank over the train graph; mean score is imputed later for unseen nodes.
pr = nx.pagerank(g, alpha=0.85)
pickle.dump(pr, open('page_rank.p', 'wb'))
pr = pickle.load(open('page_rank.p', 'rb'))
print('min', pr[min(pr, key=pr.get)])
print('max', pr[max(pr, key=pr.get)])
print('mean', float(sum(pr.values())) / len(pr))
mean_pr = float(sum(pr.values())) / len(pr)


def compute_shortest_path_length(a, b):
    """Shortest a->b path length in `g`, ignoring a direct a->b edge.

    Returns -1 when no path exists or a node is missing.
    Fix: the original removed the direct edge and, when no alternative path
    existed, the exception skipped `add_edge`, permanently mutating `g`.
    The `finally` guarantees the edge is restored.
    """
    try:
        if g.has_edge(a, b):
            g.remove_edge(a, b)
            try:
                return nx.shortest_path_length(g, source=a, target=b)
            finally:
                g.add_edge(a, b)
        return nx.shortest_path_length(g, source=a, target=b)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return -1


# Weakly connected components of the train graph, computed once.
wcc = list(nx.weakly_connected_components(g))


def belongs_to_same_wcc(a, b):
    """1 if a and b belong to the same weakly connected component, else 0.

    When the edge a->b already exists it is temporarily removed: the pair
    only counts as "same component" if a path still connects them without it.
    """
    index = []
    if g.has_edge(b, a):
        return 1
    if g.has_edge(a, b):
        for i in wcc:
            if a in i:
                index = i
                break
        if b in index:
            g.remove_edge(a, b)
            if compute_shortest_path_length(a, b) == -1:
                g.add_edge(a, b)
                return 0
            else:
                g.add_edge(a, b)
                return 1
        else:
            return 0
    else:
        for i in wcc:
            if a in i:
                index = i
                break
        if b in index:
            return 1
        else:
            return 0


def calc_adar_in(a, b):
    """Adamic–Adar index over the common successors of a and b.

    Each common neighbour i contributes 1/log10(#predecessors(i)).
    Fixes: neighbours with fewer than 2 predecessors are skipped -- the
    original divided by log10(1) == 0 (or took log10(0)), injecting inf/NaN
    into the sum; the accumulator no longer shadows builtin `sum`.
    """
    total = 0
    try:
        n = list(set(g.successors(a)).intersection(set(g.successors(b))))
        if len(n) != 0:
            for i in n:
                deg = len(list(g.predecessors(i)))
                if deg > 1:
                    total = total + (1 / np.log10(deg))
            return total
        else:
            return 0
    except nx.NetworkXError:
        return 0


def follows_back(a, b):
    """1 if the reverse edge b->a exists in `g`, else 0."""
    if g.has_edge(b, a):
        return 1
    else:
        return 0


# Katz centrality; mean score imputed later for unseen nodes.
katz = nx.katz.katz_centrality(g, alpha=0.005, beta=1)
print('min', katz[min(katz, key=katz.get)])
print('max', katz[max(katz, key=katz.get)])
print('mean', float(sum(katz.values())) / len(katz))
mean_katz = float(sum(katz.values())) / len(katz)

# HITS scores: hits[0] = hubs, hits[1] = authorities.
hits = nx.hits(g, max_iter=100, tol=1e-08, nstart=None, normalized=True)
print('min', hits[0][min(hits[0], key=hits[0].get)])
print('max', hits[0][max(hits[0], key=hits[0].get)])
print('mean', float(sum(hits[0].values())) / len(hits[0]))
pickle.dump(hits, open('hits.p', 'wb'))

# Jaccard follower/followee features for train and test frames.
df_final_train['jaccard_followers'] = df_final_train.apply(
    lambda row: jaccard_for_followers(row['Parent_process_encoded'],
                                      row['New_process_encoded']), axis=1)
df_final_test['jaccard_followers'] = df_final_test.apply(
    lambda row: jaccard_for_followers(row['Parent_process_encoded'],
                                      row['New_process_encoded']), axis=1)
df_final_train['jaccard_followees'] = df_final_train.apply(
    lambda row: jaccard_for_followees(row['Parent_process_encoded'],
                                      row['New_process_encoded']), axis=1)
df_final_test['jaccard_followees'] = df_final_test.apply(
    lambda row: jaccard_for_followees(row['Parent_process_encoded'],
                                      row['New_process_encoded']), axis=1)

# Calculating no of followers/followees for source and destination and the
# intersection of followers and followees for source and destination.
# ---------------------------------------------------------------------------
# Degree-based features: follower/followee counts and overlaps per edge pair.
# ---------------------------------------------------------------------------
def _follower_followee_counts(frame):
    """Per-row degree features for an edge frame.

    For each (Parent_process_encoded, New_process_encoded) row, counts the
    predecessors/successors of both endpoints in the train graph `g`, plus
    the intersections.  Nodes absent from `g` contribute empty sets.

    Returns six parallel lists:
    (num_followers_s, num_followees_s, num_followers_d, num_followees_d,
     inter_followers, inter_followees).
    """
    num_followers_s = []
    num_followees_s = []
    num_followers_d = []
    num_followees_d = []
    inter_followers = []
    inter_followees = []
    for i, row in frame.iterrows():
        try:
            s1 = set(g.predecessors(row['Parent_process_encoded']))
            s2 = set(g.successors(row['Parent_process_encoded']))
        except nx.NetworkXError:  # source node not in the train graph
            s1 = set()
            s2 = set()
        try:
            d1 = set(g.predecessors(row['New_process_encoded']))
            d2 = set(g.successors(row['New_process_encoded']))
        except nx.NetworkXError:  # destination node not in the train graph
            d1 = set()
            d2 = set()
        num_followers_s.append(len(s1))
        num_followees_s.append(len(s2))
        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))
        inter_followers.append(len(s1.intersection(d1)))
        inter_followees.append(len(s2.intersection(d2)))
    return (num_followers_s, num_followees_s, num_followers_d,
            num_followees_d, inter_followers, inter_followees)


# Fix: the original duplicated this entire loop for train and test; one
# helper removes the copy/paste divergence risk.
(df_final_train['num_followers_s'], df_final_train['num_followees_s'],
 df_final_train['num_followers_d'], df_final_train['num_followees_d'],
 df_final_train['inter_followers'], df_final_train['inter_followees']) = \
    _follower_followee_counts(df_final_train)

(df_final_test['num_followers_s'], df_final_test['num_followees_s'],
 df_final_test['num_followers_d'], df_final_test['num_followees_d'],
 df_final_test['inter_followers'], df_final_test['inter_followees']) = \
    _follower_followee_counts(df_final_test)

# Checkpoint to disk.
df_final_train.to_csv('df_final_train_some.csv', index=False)
df_final_test.to_csv('df_final_test_some.csv', index=False)
df_final_train.head()
df_final_test.head()

# Adar index, follow-back flag and same-WCC flag for every candidate edge.
df_final_train['adar_index'] = df_final_train.apply(
    lambda row: calc_adar_in(row['Parent_process_encoded'],
                             row['New_process_encoded']), axis=1)
df_final_test['adar_index'] = df_final_test.apply(
    lambda row: calc_adar_in(row['Parent_process_encoded'],
                             row['New_process_encoded']), axis=1)
df_final_train['follows_back'] = df_final_train.apply(
    lambda row: follows_back(row['Parent_process_encoded'],
                             row['New_process_encoded']), axis=1)
df_final_test['follows_back'] = df_final_test.apply(
    lambda row: follows_back(row['Parent_process_encoded'],
                             row['New_process_encoded']), axis=1)
df_final_train['same_comp'] = df_final_train.apply(
    lambda row: belongs_to_same_wcc(row['Parent_process_encoded'],
                                    row['New_process_encoded']), axis=1)
df_final_test['same_comp'] = df_final_test.apply(
    lambda row: belongs_to_same_wcc(row['Parent_process_encoded'],
                                    row['New_process_encoded']), axis=1)

# Checkpoint: the operations above are slow, so persist and reload here.
df_final_train.to_csv('df_final_train_some1.csv', index=False)
df_final_test.to_csv('df_final_test_some1.csv', index=False)
df_final_train = pd.read_csv('df_final_train_some1.csv')
df_final_test = pd.read_csv('df_final_test_some1.csv')

# Mapping shortest path on train.
# ---------------------------------------------------------------------------
# Path-distance, PageRank, Katz and HITS features; SVD source vectors (U).
# ---------------------------------------------------------------------------
df_final_train['shortest_path'] = df_final_train.apply(
    lambda row: compute_shortest_path_length(row['Parent_process_encoded'],
                                             row['New_process_encoded']), axis=1)
df_final_test['shortest_path'] = df_final_test.apply(
    lambda row: compute_shortest_path_length(row['Parent_process_encoded'],
                                             row['New_process_encoded']), axis=1)
df_final_train.to_csv('df_final_train_some2.csv', index=False)
df_final_test.to_csv('df_final_test_some2.csv', index=False)

# PageRank per endpoint; nodes unseen in the train graph get the mean score.
df_final_train['page_rank_s'] = df_final_train.Parent_process_encoded.apply(lambda x: pr.get(x, mean_pr))
df_final_train['page_rank_d'] = df_final_train.New_process_encoded.apply(lambda x: pr.get(x, mean_pr))
df_final_test['page_rank_s'] = df_final_test.Parent_process_encoded.apply(lambda x: pr.get(x, mean_pr))
df_final_test['page_rank_d'] = df_final_test.New_process_encoded.apply(lambda x: pr.get(x, mean_pr))

# Katz centrality per endpoint; unseen nodes get the mean Katz score.
df_final_train['katz_s'] = df_final_train.Parent_process_encoded.apply(lambda x: katz.get(x, mean_katz))
df_final_train['katz_d'] = df_final_train.New_process_encoded.apply(lambda x: katz.get(x, mean_katz))
df_final_test['katz_s'] = df_final_test.Parent_process_encoded.apply(lambda x: katz.get(x, mean_katz))
df_final_test['katz_d'] = df_final_test.New_process_encoded.apply(lambda x: katz.get(x, mean_katz))

# HITS hub/authority scores (hits[0] = hubs, hits[1] = authorities);
# unseen nodes get 0.
df_final_train['hubs_s'] = df_final_train.Parent_process_encoded.apply(lambda x: hits[0].get(x, 0))
df_final_train['hubs_d'] = df_final_train.New_process_encoded.apply(lambda x: hits[0].get(x, 0))
df_final_train['authorities_s'] = df_final_train.Parent_process_encoded.apply(lambda x: hits[1].get(x, 0))
df_final_train['authorities_d'] = df_final_train.New_process_encoded.apply(lambda x: hits[1].get(x, 0))
df_final_test['hubs_s'] = df_final_test.Parent_process_encoded.apply(lambda x: hits[0].get(x, 0))
df_final_test['hubs_d'] = df_final_test.New_process_encoded.apply(lambda x: hits[0].get(x, 0))
df_final_test['authorities_s'] = df_final_test.Parent_process_encoded.apply(lambda x: hits[1].get(x, 0))
df_final_test['authorities_d'] = df_final_test.New_process_encoded.apply(lambda x: hits[1].get(x, 0))

# Dependent variable: 1 = link exists, 0 = synthesized negative.
df_final_train['indicator_link'] = y_final_train
df_final_test['indicator_link'] = y_final_test
df_final_train.to_csv('train_df.csv', index=False)
df_final_test.to_csv('test_df.csv', index=False)
df_final_train = pd.read_csv('train_df.csv')
df_final_test = pd.read_csv('test_df.csv')

# Map node id -> row index of the (sorted) adjacency matrix used for SVD.
sadj_col = sorted(g.nodes())
sadj_dict = {val: idx for idx, val in enumerate(sadj_col)}
del sadj_col

# Left-singular-vector (U) features per endpoint: each cell holds a list of
# 3 SVD components; nodes missing from the train graph get zeros.
train_df_svd_u = pd.DataFrame()
test_df_svd_u = pd.DataFrame()


def svd_s(x):
    """Return the 3-component U row for node x, or zeros when unseen."""
    try:
        z = sadj_dict[x]
        return U[z]
    except KeyError:  # fix: the original bare `except:` hid every error
        return [0, 0, 0]


train_df_svd_u['features_s'] = df_final_train.Parent_process_encoded.apply(lambda x: svd_s(x))
train_df_svd_u['features_d'] = df_final_train.New_process_encoded.apply(lambda x: svd_s(x))
test_df_svd_u['features_s'] = df_final_test.Parent_process_encoded.apply(lambda x: svd_s(x))
test_df_svd_u['features_d'] = df_final_test.New_process_encoded.apply(lambda x: svd_s(x))
# ---------------------------------------------------------------------------
# Right-singular-vector (V) features, final feature matrix and random forest.
# ---------------------------------------------------------------------------
train_df_svd_v = pd.DataFrame()
test_df_svd_v = pd.DataFrame()


def svd_v(x):
    """Return the 3-component V row for node x, or zeros when unseen."""
    try:
        z = sadj_dict[x]
        return V.T[z]
    except KeyError:  # fix: the original bare `except:` hid every error
        return [0, 0, 0]


train_df_svd_v['features_s'] = df_final_train.Parent_process_encoded.apply(lambda x: svd_v(x))
train_df_svd_v['features_d'] = df_final_train.New_process_encoded.apply(lambda x: svd_v(x))
test_df_svd_v['features_s'] = df_final_test.Parent_process_encoded.apply(lambda x: svd_v(x))
test_df_svd_v['features_d'] = df_final_test.New_process_encoded.apply(lambda x: svd_v(x))

train_df_svd_u.index = df_final_train.index
train_df_svd_v.index = df_final_train.index
test_df_svd_u.index = df_final_test.index
test_df_svd_v.index = df_final_test.index

# Expand each list-valued cell into 3 scalar feature columns.
# https://stackoverflow.com/questions/35491274/pandas-split-column-of-lists-into-multiple-columns
df_final_train[['svd_u_s_1', 'svd_u_s_2', 'svd_u_s_3']] = \
    pd.DataFrame(train_df_svd_u.features_s.values.tolist(), index=df_final_train.index)
df_final_test[['svd_u_s_1', 'svd_u_s_2', 'svd_u_s_3']] = \
    pd.DataFrame(test_df_svd_u.features_s.values.tolist(), index=df_final_test.index)
df_final_train[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3']] = \
    pd.DataFrame(train_df_svd_u.features_d.values.tolist(), index=df_final_train.index)
df_final_test[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3']] = \
    pd.DataFrame(test_df_svd_u.features_d.values.tolist(), index=df_final_test.index)
del train_df_svd_u
del test_df_svd_u

df_final_train[['svd_v_s_1', 'svd_v_s_2', 'svd_v_s_3']] = \
    pd.DataFrame(train_df_svd_v.features_s.values.tolist(), index=df_final_train.index)
df_final_test[['svd_v_s_1', 'svd_v_s_2', 'svd_v_s_3']] = \
    pd.DataFrame(test_df_svd_v.features_s.values.tolist(), index=df_final_test.index)
df_final_train[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3']] = \
    pd.DataFrame(train_df_svd_v.features_d.values.tolist(), index=df_final_train.index)
df_final_test[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3']] = \
    pd.DataFrame(test_df_svd_v.features_d.values.tolist(), index=df_final_test.index)
del train_df_svd_v
del test_df_svd_v
del U
del V
del sadj_dict

df_final_train.columns
df_final_train.to_csv('train_df_final.csv', index=False)
df_final_test.to_csv('test_df_final.csv', index=False)
df_final_train = pd.read_csv('train_df_final.csv')
df_final_test = pd.read_csv('test_df_final.csv')
df_final_train.columns

# Separate the target and drop id/target columns from the feature matrices.
y_train = df_final_train.indicator_link
y_test = df_final_test.indicator_link
df_final_train.drop(['Parent_process_encoded', 'New_process_encoded', 'indicator_link'],
                    axis=1, inplace=True)
df_final_test.drop(['Parent_process_encoded', 'New_process_encoded', 'indicator_link'],
                   axis=1, inplace=True)
df_final_train.columns
df_final_test.columns
print('Train Shape', df_final_train.shape)
print('Test Shape', df_final_test.shape)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Sweep n_estimators at fixed depth 5.
estimators = [2, 4, 8, 16, 32]
train_scores = []
test_scores = []
for i in estimators:
    clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                 max_depth=5, max_features='auto', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=52, min_samples_split=120,
                                 min_weight_fraction_leaf=0.0, n_estimators=i,
                                 n_jobs=-1, random_state=25, verbose=0, warm_start=False)
    clf.fit(df_final_train, y_train)
    train_sc = f1_score(y_train, clf.predict(df_final_train))
    test_sc = f1_score(y_test, clf.predict(df_final_test))
    test_scores.append(test_sc)
    train_scores.append(train_sc)
    print('Estimators = ', i, 'Train Score', train_sc, 'test Score', test_sc)
plt.plot(estimators, train_scores, label='Train Score')
plt.plot(estimators, test_scores, label='Test Score')
plt.xlabel('Estimators')
plt.ylabel('Score')
plt.title('Estimators vs score at depth of 5')

# Sweep max_depth at fixed n_estimators = 115.
depths = [2, 3, 5]
train_scores = []
test_scores = []
for i in depths:
    clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                 max_depth=i, max_features='auto', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=52, min_samples_split=120,
                                 min_weight_fraction_leaf=0.0, n_estimators=115,
                                 n_jobs=-1, random_state=25, verbose=0, warm_start=False)
    clf.fit(df_final_train, y_train)
    train_sc = f1_score(y_train, clf.predict(df_final_train))
    test_sc = f1_score(y_test, clf.predict(df_final_test))
    test_scores.append(test_sc)
    train_scores.append(train_sc)
    print('depth = ', i, 'Train Score', train_sc, 'test Score', test_sc)
plt.plot(depths, train_scores, label='Train Score')
plt.plot(depths, test_scores, label='Test Score')
plt.xlabel('Depth')
plt.ylabel('Score')
plt.title('Depth vs score at depth of 5 at estimators = 115')
plt.show()

# Randomized hyper-parameter search (f1 scoring, 10-fold CV).
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

param_dist = {"n_estimators": sp_randint(105, 125),
              "max_depth": sp_randint(1, 5),
              "min_samples_split": sp_randint(110, 190),
              "min_samples_leaf": sp_randint(25, 65)}
clf = RandomForestClassifier(random_state=25, n_jobs=-1)
rf_random = RandomizedSearchCV(clf, param_distributions=param_dist,
                               n_iter=5, cv=10, scoring='f1', random_state=25)
rf_random.fit(df_final_train, y_train)
import pickle
pickle.dump(rf_random, open('rf_random_2.p', 'wb'))
rf_random.cv_results_
print('mean test scores', rf_random.cv_results_['mean_test_score'])

# Final model with hand-picked parameters.
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                             max_depth=14, max_features='auto', max_leaf_nodes=None,
                             min_impurity_decrease=0.0, min_impurity_split=None,
                             min_samples_leaf=28, min_samples_split=111,
                             min_weight_fraction_leaf=0.0, n_estimators=121,
                             n_jobs=-1, oob_score=False, random_state=25,
                             verbose=0, warm_start=False)
clf.fit(df_final_train, y_train)
pickle.dump(clf, open('clf_rf.p', 'wb'))
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(test_y, predict_y):
    """Show the confusion, precision and recall matrices side by side.

    A is the row-normalised matrix (recall per true class); B is the
    column-normalised matrix (precision per predicted class).
    """
    C = confusion_matrix(test_y, predict_y)
    A = (((C.T) / (C.sum(axis=1))).T)
    B = (C / C.sum(axis=0))
    plt.figure(figsize=(20, 4))
    labels = [0, 1]
    cmap = sns.light_palette("blue")

    # Raw confusion counts.
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f",
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")

    # Precision view (column-normalised).
    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f",
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")

    # Recall view (row-normalised).
    plt.subplot(1, 3, 3)
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f",
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")
    plt.show()


# Evaluate the fitted classifier on train and test splits.
y_pred_train = clf.predict(df_final_train)
y_pred_test = clf.predict(df_final_test)
print('Train confusion_matrix')
plot_confusion_matrix(y_train, y_pred_train)
print('Test confusion_matrix')
plot_confusion_matrix(y_test, y_pred_test)
print('Train f1 score', f1_score(y_train, y_pred_train))
print('Test f1 score', f1_score(y_test, y_pred_test))

# ROC over the (hard) test predictions.
from sklearn.metrics import roc_curve, auc

fpr, tpr, ths = roc_curve(y_test, y_pred_test)
auc_sc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='navy', label='ROC curve (area = %0.2f)' % auc_sc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic with test data')
plt.legend()
plt.show()

# Top-15 feature importances of the final model.
features = df_final_train.columns
importances = clf.feature_importances_
indices = (np.argsort(importances))[-15:]
plt.figure(figsize=(8, 10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
df.head()
# ---------------------------------------------------------------------------
# Rank/Sub-Domain co-occurrence graph, interactive plotly view, and a
# Girvan–Newman split of the graph into communities.
# ---------------------------------------------------------------------------
unique_proc = pd.DataFrame(
    {'process_count': df.groupby(['Rank Name', 'Sub Domain']).size()}).reset_index()
A = list(unique_proc["Rank Name"].unique())
node_list = set(A)
G = nx.Graph()
for i in node_list:
    G.add_node(i)
for i, j in unique_proc.iterrows():
    G.add_edges_from([(j["Rank Name"], j["Sub Domain"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
# Stash each node's layout position as a node attribute for the plotly traces.
for n, p in pos.items():
    G.nodes[n]['pos'] = p

# One line trace holding every edge (None separates segments).
edge_trace = go.Scatter(x=[], y=[], line=dict(width=0.8, color='#888'),
                        hoverinfo='none', mode='lines')
for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos']
    x1, y1 = G.nodes[edge[1]]['pos']
    edge_trace['x'] += tuple([x0, x1, None])
    edge_trace['y'] += tuple([y0, y1, None])

# Marker trace: nodes coloured by their connection count.
node_trace = go.Scatter(
    x=[], y=[], text=[], mode='markers', hoverinfo='text',
    marker=dict(showscale=True, colorscale='RdBu', reversescale=True,
                color=[], size=15,
                colorbar=dict(thickness=10, title='Node Connections',
                              xanchor='left', titleside='right'),
                line=dict(width=0)))
for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_trace['x'] += tuple([x])
    node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
    node_trace['marker']['color'] += tuple([len(adjacencies[1])])
    node_info = adjacencies[0] + ' # of connections: ' + str(len(adjacencies[1]))
    node_trace['text'] += tuple([node_info])

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Process network connections',
                    titlefont=dict(size=16),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    annotations=[dict(text="No. of connections",
                                      showarrow=False, xref="paper", yref="paper")],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)


def edge_to_remove(graph):
    """Return the edge with the highest edge-betweenness centrality."""
    G_dict = nx.edge_betweenness_centrality(graph)
    # max() with the dict lookup replaces the original sort-then-break loop;
    # ties resolve to the first-encountered key in both versions.
    return max(G_dict, key=G_dict.get)


def girvan_newman(graph):
    """Split `graph` (mutated in place) until it has more than one component.

    Classic Girvan–Newman step: repeatedly remove the highest-betweenness
    edge.  Fix: the original called edge_to_remove() twice per iteration,
    recomputing the expensive betweenness centrality needlessly.
    """
    sg = nx.connected_components(graph)
    sg_count = nx.number_connected_components(graph)
    while sg_count == 1:
        u, v = edge_to_remove(graph)
        graph.remove_edge(u, v)
        sg = nx.connected_components(graph)
        sg_count = nx.number_connected_components(graph)
    return sg


# Find communities in the graph (operates on a copy; G itself is untouched).
c = girvan_newman(G.copy())
node_groups = []
for i in c:
    node_groups.append(list(i))
len(node_groups)

plt.figure(figsize=(15, 6))
# Colour nodes by community membership.
# NOTE(review): assumes at least 3 groups exist; node_groups[2] raises
# IndexError otherwise -- preserved from the original.
color_map = []
for node in G:
    if node in node_groups[0]:
        color_map.append('blue')
    elif node in node_groups[1]:
        color_map.append('green')
    elif node in node_groups[2]:
        color_map.append('yellow')
    else:
        color_map.append('red')
nx.draw(G, node_color=color_map, with_labels=True)
plt.show()

import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import community
import matplotlib
import matplotlib.pylab as pl
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
from collections import Counter

# Centrality metrics for the co-occurrence graph.
dict_degree_centrality = nx.degree_centrality(G)
dict_closeness_centrality = nx.closeness_centrality(G)
dict_eigenvector_centrality = nx.eigenvector_centrality(G)
# Top 10 nodes with the largest degree centrality, in descending order.
dict(Counter(dict_degree_centrality).most_common(10))
# Top 10 nodes by closeness / eigenvector centrality, in descending order.
dict(Counter(dict_closeness_centrality).most_common(10))
dict(Counter(dict_eigenvector_centrality).most_common(10))

matplotlib.rcParams['figure.figsize'] = [24, 8]


def draw(G, pos, lista, listb, measure_name):
    """Draw G with nodes in `listb` coloured by the centrality values in `lista`."""
    nodes = nx.draw_networkx_nodes(G, pos, node_size=100, cmap=plt.cm.viridis,
                                   node_color=lista, nodelist=listb)
    # Symmetric-log norm keeps small centrality differences visible.
    nodes.set_norm(mcolors.SymLogNorm(linthresh=0.01, linscale=1))
    edges = nx.draw_networkx_edges(G, pos)
    plt.title(measure_name, fontsize=22, fontname='Arial')
    plt.colorbar(nodes)
    plt.axis('off')


# Three-panel centrality summary: degree / closeness / eigenvector.
plt.subplot(1, 3, 1)
list_pos_values = []
for i in nx.degree_centrality(G).values():
    list_pos_values.append(i)
list_pos_keys = []
for i in nx.degree_centrality(G).keys():
    list_pos_keys.append(i)
draw(G, pos, list_pos_values, list_pos_keys, 'Degree Centrality')

plt.subplot(1, 3, 2)
list_pos_values = []
for i in nx.closeness_centrality(G).values():
    list_pos_values.append(i)
list_pos_keys = []
for i in nx.closeness_centrality(G).keys():
    list_pos_keys.append(i)
draw(G, pos, list_pos_values, list_pos_keys, 'Closeness Centrality')

plt.subplot(1, 3, 3)
list_pos_values = []
for i in nx.eigenvector_centrality(G).values():
    list_pos_values.append(i)
list_pos_keys = []
for i in nx.eigenvector_centrality(G).keys():
    list_pos_keys.append(i)
draw(G, pos, list_pos_values, list_pos_keys, 'Eigenvector Centrality')
plt.savefig('centrality_summary.png', dpi=400)

# Louvain community detection (python-louvain).
from networkx.algorithms.community.centrality import girvan_newman
from community import community_louvain

partition = community_louvain.best_partition(G, weight='process_count')
print('Completed Louvain algorithm .. . . ')
values = [partition.get(node) for node in G.nodes()]
list_com = partition.values()

# {community_number: 'member1 | member2 | ...'}
dict_nodes = {}
for each_item in partition.items():
    community_num = each_item[1]
    community_node = each_item[0]
    if community_num in dict_nodes:
        value = dict_nodes.get(community_num) + ' | ' + str(community_node)
        dict_nodes.update({community_num: value})
    else:
        dict_nodes.update({community_num: community_node})

# Persist the community membership list.
community_df = pd.DataFrame.from_dict(dict_nodes, orient='index', columns=['Members'])
community_df.index.rename('Community_Num', inplace=True)
community_df.to_csv('Community_List_snippet.csv')

# One node per community, plus overall modularity of the partition.
matplotlib.rcParams['figure.figsize'] = [12, 8]
G_comm = nx.Graph()
G_comm.add_nodes_from(dict_nodes)
mod = community_louvain.modularity(partition, G)
print("Modularity: ", mod)
print("Total number of Communities=", len(G_comm.nodes()))

matplotlib.rcParams['figure.figsize'] = [12, 8]
pos_louvain = nx.spring_layout(G_comm)
nx.draw_networkx(G_comm, pos_louvain, with_labels=True, node_size=160, font_size=11,
                 label='Modularity =' + str(round(mod, 3)) +
                       ', Communities=' + str(len(G_comm.nodes())))
plt.suptitle('Community structure (Louvain Algorithm)', fontsize=22, fontname='Arial')
plt.box(on=None)
plt.axis('off')
plt.legend(bbox_to_anchor=(0, 1), loc='best', ncol=1)
plt.savefig('louvain.png', dpi=400, bbox_inches='tight')

# Viewing the list of communities.
community_df

# Colour-coded community view.  Fix: title typo 'Louviin' -> 'Louvain'.
nx.draw_networkx(G, pos, cmap=plt.get_cmap('magma'), node_color=values,
                 node_size=30, with_labels=False)
plt.suptitle('Louvain Algorithm Community Structure', fontsize=22)
plt.box(on=None)
plt.axis('off')
plt.savefig('louvain_2_ey.png', dpi=400, bbox_inches='tight')
plt.show()

unique_proc
# Weighted Rank/Sub-Domain graph for path and centrality queries.
FG = nx.from_pandas_edgelist(unique_proc, source='Rank Name', target='Sub Domain',
                             edge_attr='process_count')
nx.algorithms.degree_centrality(FG)
# For a node of degree k - what is the average of its neighbours' degree?
nx.average_degree_connectivity(FG)
# Kept for reference (deliberately disabled): enumerating every simple path
# can be combinatorially explosive on dense graphs.
# Fixed: removed a stray trailing `x` token left over from notebook export
# (it would raise NameError at runtime).
'''# Let us find all the paths available
for path in nx.all_simple_paths(FG, source='Rank 1', target='Rank 10'):
    print(path)
'''
# Shortest paths on the rank/sub-domain graph FG built earlier.
# Background on Dijkstra:
# https://courses.csail.mit.edu/6.006/fall11/lectures/lecture16.pdf
dijpath = nx.dijkstra_path(FG, source='Rank 1', target='Rank 4')
dijpath
# Dijkstra path weighted by the badge counts on each edge.
shortpath = nx.dijkstra_path(FG, source='Rank 10', target='Rank 15', weight='process_count')
shortpath
[n for n in FG.neighbors('Rank 11')]


def _top_three_nodes(cent):
    """Return three sets of nodes holding the largest, second-largest and
    third-largest values of the centrality dict `cent`.

    The cut-offs are taken from the sorted value list (with duplicates), so on
    ties the same node can appear in more than one set — this preserves the
    original behaviour.
    """
    ordered = sorted(cent.values())
    max_1, max_2, max_3 = ordered[-1], ordered[-2], ordered[-3]
    maxnode1, maxnode2, maxnode3 = set(), set(), set()
    for node, value in cent.items():
        if value == max_1:
            maxnode1.add(node)
        if value == max_2:
            maxnode2.add(node)
        if value == max_3:
            maxnode3.add(node)
    return maxnode1, maxnode2, maxnode3


def find_nodes_with_highest_deg_cent(G):
    """Sets of nodes with the three largest degree-centrality values in G."""
    return _top_three_nodes(nx.degree_centrality(G))


def find_nodes_with_highest_betw_cent(G):
    """Sets of nodes with the three largest betweenness-centrality values in G."""
    return _top_three_nodes(nx.betweenness_centrality(G))


top_deg_dc, top2_deg_dc, top3_deg_dc = find_nodes_with_highest_deg_cent(FG)
print(top_deg_dc, top2_deg_dc, top3_deg_dc)

top_betw_dc, top2_betw_dc, top3_betw_dc = find_nodes_with_highest_betw_cent(FG)
print(top_betw_dc, top2_betw_dc, top3_betw_dc)

# Clustering coefficients.
cc = nx.average_clustering(FG)
print(cc)
c = nx.clustering(FG)            # dict: clustering value of each node
print(c)
print(nx.clustering(FG))
print(nx.clustering(FG, 'Rank 1'))   # clustering value of one node

# Connectivity diagnostics.
print(nx.is_connected(FG))
print(nx.number_connected_components(FG))
print(list(nx.connected_components(FG)))
print(nx.node_connected_component(FG, 'Rank 1'))
# Minimum number of nodes / edges whose removal disconnects the graph.
print(nx.node_connectivity(FG))
print(nx.edge_connectivity(FG))
print("Eccentricity: ", nx.eccentricity(FG))
print("Diameter: ", nx.diameter(FG))
print("Radius: ", nx.radius(FG))
print("Periphery: ", list(nx.periphery(FG)))  # fixed typo 'Preiphery' in the label
print("Center: ", list(nx.center(FG)))
def common_neighbors(g, edges):
    """For each (u, v) pair in `edges`, count the neighbours that u and v
    share in graph g. Returns a list of (u, v, count) tuples; pairs whose
    endpoints are missing from g are skipped (as in the original)."""
    result = []
    for edge in edges:
        node_one, node_two = edge[0], edge[1]
        try:
            # Materialise as sets: g.neighbors() may return a one-shot
            # iterator (networkx >= 2.0), which the original membership
            # loop would silently exhaust after the first test.
            neighbors_one = set(g.neighbors(node_one))
            neighbors_two = set(g.neighbors(node_two))
        except Exception:  # endpoint not in the graph — skip this pair
            continue
        result.append((node_one, node_two, len(neighbors_one & neighbors_two)))
    return result


# NOTE(review): the original script called `common_neighbors(g, edges)` here,
# but neither `g` nor `edges` is defined anywhere in this file (NameError at
# import time). Disabled pending clarification of where they come from.
# common_neighbors(g, edges)


def CommonNeighbors(u, v, g):
    """Number of neighbours shared by u and v in g."""
    return len(set(g.neighbors(u)) & set(g.neighbors(v)))


def AdamicAdar(u, v, g):
    """Adamic-Adar index: sum of 1/log(degree(i)) over common neighbours i.

    Raises ZeroDivisionError when a common neighbour has exactly one
    neighbour (log(1) == 0) — unchanged from the original; AllFeatures
    absorbs it.
    """
    import math  # local import: math is never imported at the top of this file
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    aa = 0
    for i in u_neighbors.intersection(v_neighbors):
        # len(list(...)) so this also works when neighbors() is an iterator.
        aa += 1 / math.log(len(list(g.neighbors(i))))
    return aa


def ResourceAllocation(u, v, g):
    """Resource-allocation index: sum of 1/degree(i) over common neighbours i."""
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    ra = 0
    for i in u_neighbors.intersection(v_neighbors):
        ra += 1 / float(len(list(g.neighbors(i))))
    return ra


def JaccardCoefficent(u, v, g):
    """Jaccard coefficient: |common neighbours| / |union of neighbourhoods|.

    Raises ZeroDivisionError when both neighbourhoods are empty — unchanged
    from the original; AllFeatures absorbs it.
    """
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors)) / float(len(u_neighbors.union(v_neighbors)))


def PreferentialAttachment(u, v, g):
    """Preferential-attachment score: degree(u) * degree(v)."""
    return len(list(g.neighbors(u))) * len(list(g.neighbors(v)))


def AllFeatures(u, v, g1, g2):
    """Link-prediction features for the pair (u, v) on snapshot g2, plus the
    change of each feature between two consecutive sub-graphs g1 -> g2.

    Returns a dict of features and deltas, or None when any feature cannot be
    computed (node absent, empty/degenerate neighbourhoods), matching the
    original silent-failure behaviour.
    """
    try:
        cn = CommonNeighbors(u, v, g2)
        aa = AdamicAdar(u, v, g2)
        ra = ResourceAllocation(u, v, g2)
        jc = JaccardCoefficent(u, v, g2)
        pa = PreferentialAttachment(u, v, g2)
        return {
            "cn": cn, "aa": aa, "ra": ra, "jc": jc, "pa": pa,
            "delta_cn": cn - CommonNeighbors(u, v, g1),
            "delta_aa": aa - AdamicAdar(u, v, g1),
            "delta_ra": ra - ResourceAllocation(u, v, g1),
            "delta_jc": jc - JaccardCoefficent(u, v, g1),
            "delta_pa": pa - PreferentialAttachment(u, v, g1),
        }
    except Exception:  # original used a bare except and returned None
        return None